{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "VBFvXsCOxHeV"
      },
      "source": [
        "#  Vision-based End-to-End Driving Tutorial\n",
        "\n",
        "- Website: https://waymo.com/open\n",
        "- GitHub: https://github.com/waymo-research/waymo-open-dataset\n",
        "- Challenge: https://waymo.com/open/challenges/2025/e2e-driving/\n",
        "\n",
        "This tutorial demonstrates how to load, visualize and submit end-to-end driving data. Visit the [Waymo Open Dataset Website](https://waymo.com/open) to download the full dataset.\n",
        "\n",
        "To use, open this notebook in [Colab](https://colab.research.google.com).\n",
        "\n",
        "Uncheck the box \"Reset all runtimes before running\" if you run this colab directly from the remote kernel. Alternatively, you can make a copy before trying to run it by following \"File \u003e Save copy in Drive ...\".\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "BBYit3Qhxw3E"
      },
      "source": [
        "## Package installation 🛠️"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "mOGwtE4EyZmY"
      },
      "outputs": [],
      "source": [
        "# Install with the %pip magic so the package lands in the environment of the\n",
        "# running kernel. The version is pinned for reproducibility.\n",
        "%pip install waymo-open-dataset-tf-2-12-0==1.6.7"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "vBAbHAxuwmic"
      },
      "outputs": [],
      "source": [
        "from typing import Tuple\n",
        "import matplotlib.pyplot as plt\n",
        "import tensorflow as tf\n",
        "import os\n",
        "import math\n",
        "import numpy as np\n",
        "import cv2\n",
        "from waymo_open_dataset import dataset_pb2 as open_dataset\n",
        "from waymo_open_dataset.wdl_limited.camera.ops import py_camera_model_ops\n",
        "\n",
        "from waymo_open_dataset.protos import end_to_end_driving_data_pb2 as wod_e2ed_pb2\n",
        "from waymo_open_dataset.protos import end_to_end_driving_submission_pb2 as wod_e2ed_submission_pb2"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "aLqKE2xj-tHH"
      },
      "source": [
        "## Loading the data\n",
        "\n",
        "Visit the [Waymo Open Dataset Website](https://waymo.com/open/) to download the\n",
        "full dataset."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "9hkOKUd6w8QG"
      },
      "outputs": [],
      "source": [
        "# Replace this path with your own tfrecords.\n",
        "# This tutorial is based on using data in the E2E Driving proto format directly,\n",
        "# so choose the correct dataset version.\n",
        "DATASET_FOLDER = '/waymo_open_dataset_'\n",
        "\n",
        "TRAIN_FILES = os.path.join(DATASET_FOLDER, 'training.tfrecord*')\n",
        "VALIDATION_FILES = os.path.join(DATASET_FOLDER, 'validation.tfrecord*')\n",
        "TEST_FILES = os.path.join(DATASET_FOLDER, 'test.tfrecord*')\n"]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "XgTIikH9Bhv6"
      },
      "source": [
        "Initialize dataset object."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "mV_m-oc2Bbsn"
      },
      "outputs": [],
      "source": [
        "filenames = tf.io.matching_files(VALIDATION_FILES)\n",
        "dataset = tf.data.TFRecordDataset(filenames, compression_type='')\n",
        "dataset_iter = dataset.as_numpy_iterator()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "RWj7JeomCy1s"
      },
      "source": [
        "Retrieve one example."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "3GIvytJ6BqHi"
      },
      "outputs": [],
      "source": [
        "bytes_example = next(dataset_iter)\n",
        "data = wod_e2ed_pb2.E2EDFrame()\n",
        "data.ParseFromString(bytes_example)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "CaXCcjcVP60q"
      },
      "source": [
        "## Visualizing the future trajectories on image\n",
        "In this tutorial, we will visualize the images of the three front cameras and project the ego vehicle's future trajectory onto them."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "rLv0k_XlTULt"
      },
      "outputs": [],
      "source": [
        "def return_front3_cameras(data: wod_e2ed_pb2.E2EDFrame):\n",
        "  \"\"\"Returns the images and calibrations of the three front cameras.\n",
        "\n",
        "  Cameras are visited in the order FRONT_LEFT, FRONT, FRONT_RIGHT so that the\n",
        "  images can be concatenated side by side. A camera that has no image in the\n",
        "  frame is skipped.\n",
        "\n",
        "  Args:\n",
        "    data: An E2EDFrame. Images are read from `data.frame.images` and\n",
        "      calibrations from `data.frame.context.camera_calibrations`.\n",
        "\n",
        "  Returns:\n",
        "    A tuple `(image_list, calibration_list)` of equal length, where\n",
        "    `image_list[i]` is the decoded image as a numpy array and\n",
        "    `calibration_list[i]` is the calibration entry paired with it.\n",
        "  \"\"\"\n",
        "  # CameraName Enum reference:\n",
        "  # https://github.com/waymo-research/waymo-open-dataset/blob/5f8a1cd42491210e7de629b6f8fc09b65e0cbe99/src/waymo_open_dataset/dataset.proto#L50\n",
        "  order = [\n",
        "      open_dataset.CameraName.FRONT_LEFT,\n",
        "      open_dataset.CameraName.FRONT,\n",
        "      open_dataset.CameraName.FRONT_RIGHT,\n",
        "  ]\n",
        "  calibrations = data.frame.context.camera_calibrations\n",
        "  # Pair each image with its calibration by camera name rather than by list\n",
        "  # position, so a differing order of the two lists cannot mix up cameras.\n",
        "  calibration_by_name = {}\n",
        "  for calibration in calibrations:\n",
        "    calibration_by_name.setdefault(calibration.name, calibration)\n",
        "\n",
        "  image_list = []\n",
        "  calibration_list = []\n",
        "  for camera_name in order:\n",
        "    for index, image_content in enumerate(data.frame.images):\n",
        "      if image_content.name != camera_name:\n",
        "        continue\n",
        "      calibration = calibration_by_name.get(camera_name)\n",
        "      if calibration is None:\n",
        "        # No calibration carries this name; fall back to positional pairing.\n",
        "        calibration = calibrations[index]\n",
        "      # Decode the raw image string and convert to numpy type.\n",
        "      image_list.append(tf.io.decode_image(image_content.image).numpy())\n",
        "      calibration_list.append(calibration)\n",
        "      break\n",
        "\n",
        "  return image_list, calibration_list"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7Yfg6ceNTJwq"
      },
      "source": [
        "Visualize the front 3 cameras"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "xFePW7Y5WZik"
      },
      "outputs": [],
      "source": [
        "front3_camera_image_list, front3_camera_calibration_list = return_front3_cameras(data)\n",
        "concatenated_image = np.concatenate(front3_camera_image_list, axis=1)\n",
        "plt.figure(figsize=(20, 20))\n",
        "plt.imshow(concatenated_image)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "UV6sHoX1w2-k"
      },
      "outputs": [],
      "source": [
        "def project_vehicle_to_image(vehicle_pose, calibration, points):\n",
        "  \"\"\"Projects from vehicle coordinate system to image with global shutter.\n",
        "\n",
        "  Arguments:\n",
        "    vehicle_pose: Vehicle pose transform from vehicle into world coordinate\n",
        "      system.\n",
        "    calibration: Camera calibration details (including intrinsics/extrinsics).\n",
        "    points: Points to project of shape [N, 3] in vehicle coordinate system.\n",
        "\n",
        "  Returns:\n",
        "    Array of shape [N, 3], with the latter dimension composed of (u, v, ok).\n",
        "  \"\"\"\n",
        "  # Transform all points from vehicle to world coordinate system in one matrix\n",
        "  # product using homogeneous coordinates. Converting to float64 first keeps\n",
        "  # integer-typed input from being truncated.\n",
        "  pose_matrix = np.array(vehicle_pose.transform).reshape(4, 4)\n",
        "  points = np.asarray(points, dtype=np.float64).reshape(-1, 3)\n",
        "  homogeneous_points = np.concatenate(\n",
        "      [points, np.ones((points.shape[0], 1))], axis=1)\n",
        "  # Row-vector form of `pose_matrix @ [x, y, z, 1]` applied to every point.\n",
        "  world_points = (homogeneous_points @ pose_matrix.T)[:, :3]\n",
        "\n",
        "  # Populate camera image metadata. Velocity and latency stats are filled with\n",
        "  # zeroes.\n",
        "  extrinsic = tf.reshape(\n",
        "      tf.constant(list(calibration.extrinsic.transform), dtype=tf.float32),\n",
        "      [4, 4])\n",
        "  intrinsic = tf.constant(list(calibration.intrinsic), dtype=tf.float32)\n",
        "  metadata = tf.constant(\n",
        "      [\n",
        "          calibration.width,\n",
        "          calibration.height,\n",
        "          open_dataset.CameraCalibration.GLOBAL_SHUTTER,\n",
        "      ],\n",
        "      dtype=tf.int32)\n",
        "  camera_image_metadata = list(vehicle_pose.transform) + [0.0] * 10\n",
        "\n",
        "  # Perform projection and return projected image coordinates (u, v, ok).\n",
        "  return py_camera_model_ops.world_to_image(extrinsic, intrinsic, metadata,\n",
        "                                            camera_image_metadata,\n",
        "                                            world_points).numpy()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "UAgH_pmoxrJo"
      },
      "outputs": [],
      "source": [
        "def draw_points_on_image(image, points, size):\n",
        "  \"\"\"Draws points on an image.\n",
        "\n",
        "  Args:\n",
        "    image: The image to draw on.\n",
        "    points: A numpy array of shape (N, 2) representing the points to draw.\n",
        "  \"\"\"\n",
        "  for point in points:\n",
        "    cv2.circle(image, (int(point[0]), int(point[1])), size, (255, 0, 0), -1)\n",
        "  return image"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "nL0uiCL1G4y6"
      },
      "source": [
        "Extract the ego vehicle's future trajectory and reshape to (N, 3) matrix."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Go_GRAlVpQJ6"
      },
      "outputs": [],
      "source": [
        "future_waypoints_matrix = np.stack([data.future_states.pos_x, data.future_states.pos_y, data.future_states.pos_z], axis=1)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "MrEuJN4Lm5-7"
      },
      "source": [
        "The pose is always an identity matrix because the data has already been converted from world coordinates to vehicle coordinates."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "kqpRRhoGjbGb"
      },
      "outputs": [],
      "source": [
        "vehicle_pose = data.frame.images[0].pose"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "tucgPv_UoDTp"
      },
      "source": [
        "We project the ego vehicle's future waypoints into each camera image and draw them on the image."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "UKzDz0NLezyG"
      },
      "outputs": [],
      "source": [
        "images_with_drawn_points = []\n",
        "for image, calibration in zip(front3_camera_image_list, front3_camera_calibration_list):\n",
        "  waypoints_camera_space = project_vehicle_to_image(vehicle_pose, calibration, future_waypoints_matrix)\n",
        "  # Draw on a copy: the drawing helper writes into the array it is given, and\n",
        "  # the decoded camera images should stay unmodified for the other cells.\n",
        "  images_with_drawn_points.append(draw_points_on_image(image.copy(), waypoints_camera_space, size=15))\n",
        "concatenated_image = np.concatenate(images_with_drawn_points, axis=1)\n",
        "plt.figure(figsize=(20, 20))\n",
        "plt.imshow(concatenated_image)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "vUh5UBgM4rv-"
      },
      "source": [
        "## Submission generation\n",
        "\n",
        "The `wod_e2ed_submission_pb2` defines the proto format of the submission.\n",
        "\n",
        "The participants are required to produce **a single trajectory** starting after the last provided frame. The trajectory should follow the `TrajectoryPrediction` format, with a length of 5 seconds and a frequency of 4 Hz (20 waypoints). Then the participants should add the corresponding frame name to form `FrameTrajectoryPredictions`. The evaluation server will compute detailed metrics and add them to the leaderboard.\n",
        "\n",
        "This section demonstrates how a submission file is created."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "x1SJS4kc_mot"
      },
      "outputs": [],
      "source": [
        "# Assume we have a predicted stopping trajectory.\n",
        "predicted_trajectory = wod_e2ed_submission_pb2.TrajectoryPrediction(pos_x=np.zeros(20, dtype=np.float32),\n",
        "                                                                    pos_y=np.zeros(20, dtype=np.float32))\n",
        "frame_name = data.frame.context.name\n",
        "frame_trajectory = wod_e2ed_submission_pb2.FrameTrajectoryPredictions(frame_name=frame_name, trajectory=predicted_trajectory)\n",
        "# The final prediction should be a list of FrameTrajectoryPredictions.\n",
        "predictions = [frame_trajectory]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "dtAFtomxBFpg"
      },
      "outputs": [],
      "source": [
        "# Pack for submission.\n",
        "num_submission_shards = 1  # Please modify accordingly.\n",
        "submission_file_base = '/tmp/MySubmission'  # Please modify accordingly.\n",
        "# exist_ok replaces the separate existence check and tolerates re-running this\n",
        "# cell.\n",
        "os.makedirs(submission_file_base, exist_ok=True)\n",
        "sub_file_names = [\n",
        "    os.path.join(submission_file_base, f'part{i}')\n",
        "    for i in range(num_submission_shards)\n",
        "]\n",
        "# As the submission file may be large, we shard them into different chunks.\n",
        "# Rounding up means the last shard may hold fewer predictions than the others.\n",
        "num_predictions_per_shard = math.ceil(len(predictions) / num_submission_shards)\n",
        "submissions = []\n",
        "for i in range(num_submission_shards):\n",
        "  start = i * num_predictions_per_shard\n",
        "  end = start + num_predictions_per_shard\n",
        "  submissions.append(\n",
        "      wod_e2ed_submission_pb2.E2EDChallengeSubmission(\n",
        "          predictions=predictions[start:end]))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "dqwJXQhdCcGn"
      },
      "outputs": [],
      "source": [
        "# Fill in the submission metadata on every shard and write each shard to its\n",
        "# own file.\n",
        "for i, shard in enumerate(submissions):\n",
        "  shard.submission_type = wod_e2ed_submission_pb2.E2EDChallengeSubmission.SubmissionType.E2ED_SUBMISSION\n",
        "  shard.authors[:] = ['A', 'B']  # Please modify accordingly.\n",
        "  shard.affiliation = 'Affiliation'  # Please modify accordingly.\n",
        "  shard.account_name = 'acc@domain.com'  # Please modify accordingly.\n",
        "  shard.unique_method_name = 'YourMethodName'  # Please modify accordingly.\n",
        "  shard.method_link = 'method_link'  # Please modify accordingly.\n",
        "  shard.description = ''  # Please modify accordingly.\n",
        "  shard.uses_public_model_pretraining = True  # Please modify accordingly.\n",
        "  # Assign instead of extend so that re-running this cell does not append the\n",
        "  # same model names a second time.\n",
        "  shard.public_model_names[:] = ['Model_name']  # Please modify accordingly.\n",
        "  shard.num_model_parameters = \"200k\"  # Please modify accordingly.\n",
        "  with tf.io.gfile.GFile(sub_file_names[i], 'wb') as fp:\n",
        "    fp.write(shard.SerializeToString())"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "TqmlWbYFEPLO"
      },
      "outputs": [],
      "source": [
        "print(submissions[0])"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "MLVzBJxXw33V"
      },
      "source": [
        "## Package submission\n",
        "```\n",
        "cd /tmp\n",
        "tar cvf MySubmission.tar MySubmission\n",
        "gzip MySubmission.tar\n",
        "```\n",
        "Then you can upload `/tmp/MySubmission.tar.gz` to the challenge website.\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ZaQ6UFQaz4fH"
      },
      "source": [
        "## Evaluation\n",
        "Once the predictions are submitted, our eval server will run the [rater feedback metric](https://waymo.com/intl/en_us/open/challenges/2025/e2e-driving/) to compute the rater feedback scores and update the leaderboard. As the rater feedback metric code won't be released, here we provide a simple ADE Metric implementation to help participants self-evaluate."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "gMeLb0pX6s1e"
      },
      "outputs": [],
      "source": [
        "def average_distance_per_step(\n",
        "    predictions , observed_traj, mask\n",
        ") :\n",
        "  \"\"\"Compute L2 distance between proposed trajectories and ground truth.\n",
        "\n",
        "  Args:\n",
        "    predictions: A numpy array representing model predictions of size: [# proposals,\n",
        "      # time steps, spatial features].\n",
        "    observed_traj: A tensor representing the observed trajectory in the logs of\n",
        "      size [# time steps, spatial features]\n",
        "    mask: A boolean tensor representing the time steps that have valid\n",
        "      observations of size [# time steps].\n",
        "\n",
        "  Returns:\n",
        "    A tensor of size [# proposals]\n",
        "  \"\"\"\n",
        "  dist_per_step = np.linalg.norm(\n",
        "      predictions - observed_traj[np.newaxis], axis=-1\n",
        "  )\n",
        "  dist_per_traj = (dist_per_step * mask[np.newaxis]).sum(axis=-1)\n",
        "  valid_steps = np.maximum(mask.sum(axis=-1, keepdims=True), 1.0)\n",
        "  avg_distance = dist_per_traj / valid_steps\n",
        "  return avg_distance\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "yeNkBSQI_A8h"
      },
      "source": [
        "Convert both the ground truth and the predictions to dictionaries keyed by frame name for convenience."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "UyHfj-pz_AU5"
      },
      "outputs": [],
      "source": [
        "# Both dictionaries map frame_name to a trajectory array with one (x, y) row\n",
        "# per waypoint.\n",
        "gt_states = data.future_states\n",
        "gt_dict = {\n",
        "    data.frame.context.name: np.stack([gt_states.pos_x, gt_states.pos_y], axis=1),\n",
        "}\n",
        "prediction_dict = {}\n",
        "for submission in submissions:\n",
        "  for prediction in submission.predictions:\n",
        "    trajectory = prediction.trajectory\n",
        "    prediction_dict[prediction.frame_name] = np.stack(\n",
        "        [trajectory.pos_x, trajectory.pos_y], axis=1)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "rJhRZ2knDKOe"
      },
      "outputs": [],
      "source": [
        "# Compute ADEs. Our ADE calculation is generalized to multiple proposals and\n",
        "# supports masking; here there is one proposal and every step is valid.\n",
        "ade_list = []\n",
        "for frame_name, gt_traj in gt_dict.items():\n",
        "  if frame_name not in prediction_dict:\n",
        "    raise ValueError(f'No prediction for {frame_name}')\n",
        "  pred_traj = prediction_dict[frame_name]\n",
        "  # Fail loudly on a shape mismatch: numpy broadcasting would otherwise accept\n",
        "  # a single-step prediction silently and report a meaningless ADE.\n",
        "  if pred_traj.shape != gt_traj.shape:\n",
        "    raise ValueError(\n",
        "        f'Prediction for {frame_name} has shape {pred_traj.shape}, '\n",
        "        f'expected {gt_traj.shape}')\n",
        "  mask = np.ones(gt_traj.shape[0], dtype=np.bool_)\n",
        "  # Add a leading axis because the metric expects [# proposals, steps, 2].\n",
        "  ade_list.append(average_distance_per_step(pred_traj[None], gt_traj, mask)[0])\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "UYkS4cgDA0B_"
      },
      "outputs": [],
      "source": [
        "print(f'ADE: {np.mean(ade_list)}')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "2VrGnxGulZLw"
      },
      "outputs": [],
      "source": [
        "# @title Find a frame that carries rater feedback labels.\n",
        "# We need to iterate over the dataset to find the frame that contains\n",
        "# the rated trajectory. Usually in a whole run, only one frame contains such label.\n",
        "data_contain_label = None\n",
        "# Scan with a fresh iterator: `dataset_iter` has already been advanced, so\n",
        "# reusing it would skip frames and give a different result on every re-run.\n",
        "for raw_data in dataset.as_numpy_iterator():\n",
        "  frame = wod_e2ed_pb2.E2EDFrame()\n",
        "  frame.ParseFromString(raw_data)\n",
        "  # A preference score of -1 on the first trajectory is treated as unlabeled.\n",
        "  if (len(frame.preference_trajectories) \u003e 0 and\n",
        "      frame.preference_trajectories[0].preference_score != -1):\n",
        "    data_contain_label = frame\n",
        "    break\n",
        "if data_contain_label is None:\n",
        "  raise ValueError('No frame with rated preference trajectories was found.')\n",
        "# Keep `data` pointing at the labeled frame, as it did when this search loop\n",
        "# parsed frames directly into `data`.\n",
        "data = data_contain_label"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "cH_MFUZSnqZm"
      },
      "outputs": [],
      "source": [
        "print(data_contain_label.frame.context.name)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Mf_BT47BDw5K"
      },
      "outputs": [],
      "source": [
        "# @title The following code shows how to calculate the rater feedback metric.\n",
        "from waymo_open_dataset.metrics.python import rater_feedback_utils\n",
        "# Assume we have a list of data.\n",
        "data_list = [data_contain_label]\n",
        "rater_specified_trajectories = []\n",
        "rater_scores = []\n",
        "initial_speed = []\n",
        "\n",
        "prediction_trajectories = []\n",
        "prediction_probabilities = []\n",
        "\n",
        "for frame in data_list:\n",
        "  # Rater-specified trajectories of this frame as arrays of (x, y) rows, and\n",
        "  # the preference score of each one.\n",
        "  current_rater_trajs = [\n",
        "      np.stack([rated.pos_x, rated.pos_y], axis=-1)\n",
        "      for rated in frame.preference_trajectories\n",
        "  ]\n",
        "  current_rater_scores = np.array(\n",
        "      [rated.preference_score for rated in frame.preference_trajectories])\n",
        "\n",
        "  # Initial speed calculation. The last entry of the past velocities should be\n",
        "  # the current velocity. It is read from the frame being processed so that\n",
        "  # every entry of `data_list` gets its own speed.\n",
        "  vel_x = frame.past_states.vel_x[-1]\n",
        "  vel_y = frame.past_states.vel_y[-1]\n",
        "  initial_speed.append(np.sqrt(vel_x**2 + vel_y**2))\n",
        "\n",
        "  # Fake prediction.\n",
        "  prediction_traj = np.zeros((20, 2))\n",
        "  # We add an empty axis as the # of proposals is 1\n",
        "  prediction_trajectories.append(prediction_traj[None])\n",
        "  prediction_probabilities.append(np.ones(1))\n",
        "  # Append the current trajectory and score to the batch list.\n",
        "  rater_specified_trajectories.append(current_rater_trajs)\n",
        "  rater_scores.append(current_rater_scores)\n",
        "\n",
        "# Convert the list of numpy array to add batch dimension.\n",
        "initial_speed = np.stack(initial_speed)\n",
        "prediction_trajectories = np.stack(prediction_trajectories)\n",
        "prediction_probabilities = np.stack(prediction_probabilities)\n",
        "\n",
        "rater_feedback_metrics = (\n",
        "        rater_feedback_utils.get_rater_feedback_score(\n",
        "            prediction_trajectories,\n",
        "            prediction_probabilities,\n",
        "            rater_specified_trajectories,\n",
        "            rater_scores,\n",
        "            initial_speed,\n",
        "            frequency=4,  # Default is 4.\n",
        "            length_seconds=5, # Default predict 5 seconds.\n",
        "            output_trust_region_visualization=False,\n",
        "        )\n",
        "    )\n",
        "print(rater_feedback_metrics['rater_feedback_score'])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "I3q2oFEpqE1y"
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "colab": {
      "last_runtime": {
        "build_target": "",
        "kind": "local"
      },
      "private_outputs": true,
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
